In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as pt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from io import StringIO
from sklearn.tree import export_graphviz
import pydotplus
import graphviz
from IPython.display import Image  
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
#from sklearn.metrics import classification_report,plot_confusion_matrix,accuracy_score
In [2]:
#!pip install graphviz
#brew install graphviz
#!conda install python-graphviz
In [3]:
#!pip install pydotplus
In [4]:
#conda install -c anaconda graphviz   
In [18]:
data_train = pd.read_csv('/Users/huzaifkherani/Desktop/AML/Project/DATA/data.csv')
data_test = pd.read_csv('/Users/huzaifkherani/Desktop/AML/Project/DATA/test.csv')
In [19]:
data_train.head()
Out[19]:
Severity Safety_Score Days_Since_Inspection Total_Safety_Complaints Control_Metric Turbulence_In_gforces Cabin_Temperature Accident_Type_Code Max_Elevation Violations Adverse_Weather_Metric Accident_ID
0 Minor_Damage_And_Injuries 49.223744 14 22 71.285324 0.272118 78.04 2 31335.476824 3 0.424352 7570
1 Minor_Damage_And_Injuries 62.465753 10 27 72.288058 0.423939 84.54 2 26024.711057 2 0.352350 12128
2 Significant_Damage_And_Fatalities 63.059361 13 16 66.362808 0.322604 78.86 7 39269.053927 3 0.003364 2181
3 Significant_Damage_And_Serious_Injuries 48.082192 11 9 74.703737 0.337029 81.79 3 42771.499200 1 0.211728 5946
4 Significant_Damage_And_Fatalities 26.484018 13 25 47.948952 0.541140 77.16 3 35509.228515 2 0.176883 9054

Exploratory Data Analysis¶

In [20]:
data_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Severity                 10000 non-null  object 
 1   Safety_Score             10000 non-null  float64
 2   Days_Since_Inspection    10000 non-null  int64  
 3   Total_Safety_Complaints  10000 non-null  int64  
 4   Control_Metric           10000 non-null  float64
 5   Turbulence_In_gforces    10000 non-null  float64
 6   Cabin_Temperature        10000 non-null  float64
 7   Accident_Type_Code       10000 non-null  int64  
 8   Max_Elevation            10000 non-null  float64
 9   Violations               10000 non-null  int64  
 10  Adverse_Weather_Metric   10000 non-null  float64
 11  Accident_ID              10000 non-null  int64  
dtypes: float64(6), int64(5), object(1)
memory usage: 937.6+ KB
In [21]:
data_train.isnull().sum() # Checking if there is any null value in the dataset
Out[21]:
Severity                   0
Safety_Score               0
Days_Since_Inspection      0
Total_Safety_Complaints    0
Control_Metric             0
Turbulence_In_gforces      0
Cabin_Temperature          0
Accident_Type_Code         0
Max_Elevation              0
Violations                 0
Adverse_Weather_Metric     0
Accident_ID                0
dtype: int64
In [22]:
data_train.drop(['Accident_ID'],axis=1,inplace=True)
In [23]:
# Drop the target and check how the features correlate
data_train.drop("Severity", axis=1).corr()
Out[23]:
Safety_Score Days_Since_Inspection Total_Safety_Complaints Control_Metric Turbulence_In_gforces Cabin_Temperature Accident_Type_Code Max_Elevation Violations Adverse_Weather_Metric
Safety_Score 1.000000 -0.685386 0.057726 0.000564 0.019603 0.032747 0.173930 0.004451 0.041735 -0.107925
Days_Since_Inspection -0.685386 1.000000 -0.032055 -0.011963 -0.001564 -0.039140 -0.024718 0.000183 -0.016724 0.040804
Total_Safety_Complaints 0.057726 -0.032055 1.000000 -0.019665 0.066412 0.013590 0.034927 0.036855 -0.019005 -0.002713
Control_Metric 0.000564 -0.011963 -0.019665 1.000000 -0.643285 -0.008330 0.008385 -0.028375 -0.003284 -0.028296
Turbulence_In_gforces 0.019603 -0.001564 0.066412 -0.643285 1.000000 0.010757 -0.007565 0.047625 0.013171 0.039802
Cabin_Temperature 0.032747 -0.039140 0.013590 -0.008330 0.010757 1.000000 0.030682 -0.009186 0.018619 -0.026647
Accident_Type_Code 0.173930 -0.024718 0.034927 0.008385 -0.007565 0.030682 1.000000 0.019970 0.046379 -0.739361
Max_Elevation 0.004451 0.000183 0.036855 -0.028375 0.047625 -0.009186 0.019970 1.000000 -0.030513 0.173436
Violations 0.041735 -0.016724 -0.019005 -0.003284 0.013171 0.018619 0.046379 -0.030513 1.000000 -0.021578
Adverse_Weather_Metric -0.107925 0.040804 -0.002713 -0.028296 0.039802 -0.026647 -0.739361 0.173436 -0.021578 1.000000

Observations¶

Days_Since_Inspection has a strong negative correlation with Safety_Score (-0.69)¶

Turbulence_In_gforces has a strong negative correlation with Control_Metric (-0.64)¶

Accident_Type_Code has a weak positive correlation with Safety_Score (+0.17)¶

Adverse_Weather_Metric has a strong negative correlation with Accident_Type_Code (-0.74)¶

Adverse_Weather_Metric has a weak positive correlation with Max_Elevation (+0.17)¶
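These pairs can also be pulled out programmatically. A minimal sketch, using only the imports already loaded above, that ranks the strongest absolute correlations:

# Rank the strongest absolute pairwise correlations among the features
corr = data_train.drop("Severity", axis=1).corr()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep upper triangle only
print(upper.stack().abs().sort_values(ascending=False).head(5))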

In [24]:
# Checking Correlation
pt.figure(figsize = (15, 7))
pt.subplot(1, 2, 1)
pt.title("Train Data")
sns.heatmap(data_train.corr())
pt.savefig('Correlation Heatmap.png')
In [25]:
inspec = data_train[data_train["Days_Since_Inspection"] == 1]
inspec
Out[25]:
Severity Safety_Score Days_Since_Inspection Total_Safety_Complaints Control_Metric Turbulence_In_gforces Cabin_Temperature Accident_Type_Code Max_Elevation Violations Adverse_Weather_Metric
840 Significant_Damage_And_Serious_Injuries 58.675799 1 2 75.387420 0.336308 75.96 6 29146.687854 3 0.007526
2283 Significant_Damage_And_Serious_Injuries 58.493151 1 5 75.387420 0.245792 76.11 6 16149.317704 3 0.003679
2611 Highly_Fatal_And_Damaging 65.342466 1 19 77.848678 0.361191 79.65 2 31661.628810 2 0.429535
7903 Highly_Fatal_And_Damaging 65.342466 1 8 58.204193 0.312146 79.43 2 28183.323130 2 0.382453
8152 Significant_Damage_And_Serious_Injuries 58.493151 1 20 67.046490 0.409514 79.72 7 25135.851480 2 0.002300
In [27]:
pt.figure()
pt.xticks(rotation = 90)
sns.countplot(x='Severity', data=data_train)
#pt.savefig('Severity vs Count graph.png')
Out[27]:
<AxesSubplot:xlabel='Severity', ylabel='count'>
In [28]:
# "Accident_Type_Code" and "Severity" are a Categorical variable hence, removing it 
pt.figure(figsize=(16,6))
data_train.boxplot(column=['Safety_Score', 'Days_Since_Inspection', 'Total_Safety_Complaints', 'Control_Metric', 
                                   'Cabin_Temperature', 'Accident_Type_Code', 'Violations'])
#pt.savefig('Box plot 1.png')
Out[28]:
<AxesSubplot:>
In [29]:
pt.figure(figsize=(12,6))
data_train.boxplot(column=['Max_Elevation'])
#pt.savefig('Boxplot 2.png')
Out[29]:
<AxesSubplot:>
In [30]:
pt.figure(figsize=(12,6))
data_train.boxplot(column=['Turbulence_In_gforces', 'Adverse_Weather_Metric'])
#pt.savefig('Boxplot 3.png')
Out[30]:
<AxesSubplot:>
In [31]:
data_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Severity                 10000 non-null  object 
 1   Safety_Score             10000 non-null  float64
 2   Days_Since_Inspection    10000 non-null  int64  
 3   Total_Safety_Complaints  10000 non-null  int64  
 4   Control_Metric           10000 non-null  float64
 5   Turbulence_In_gforces    10000 non-null  float64
 6   Cabin_Temperature        10000 non-null  float64
 7   Accident_Type_Code       10000 non-null  int64  
 8   Max_Elevation            10000 non-null  float64
 9   Violations               10000 non-null  int64  
 10  Adverse_Weather_Metric   10000 non-null  float64
dtypes: float64(6), int64(4), object(1)
memory usage: 859.5+ KB
In [32]:
data_num = pd.DataFrame(data_train, columns = data_train.columns[data_train.dtypes == 'float64']) 
data_num.head()
Out[32]:
Safety_Score Control_Metric Turbulence_In_gforces Cabin_Temperature Max_Elevation Adverse_Weather_Metric
0 49.223744 71.285324 0.272118 78.04 31335.476824 0.424352
1 62.465753 72.288058 0.423939 84.54 26024.711057 0.352350
2 63.059361 66.362808 0.322604 78.86 39269.053927 0.003364
3 48.082192 74.703737 0.337029 81.79 42771.499200 0.211728
4 26.484018 47.948952 0.541140 77.16 35509.228515 0.176883
In [33]:
# Applying zscore
In [34]:
data_num=data_num.apply(zscore)
In [35]:
data_num.head()
Out[35]:
Safety_Score Control_Metric Turbulence_In_gforces Cabin_Temperature Max_Elevation Adverse_Weather_Metric
0 0.455303 0.516733 -0.901749 -0.699134 -0.070649 0.442701
1 1.275888 0.601122 0.349922 1.656279 -0.633736 0.253773
2 1.312673 0.102462 -0.485516 -0.401990 0.770528 -0.661939
3 0.384562 0.804422 -0.366593 0.659758 1.141883 -0.115208
4 -0.953841 -1.447221 1.316177 -1.018021 0.371883 -0.206638

Removing all records with a z-score above 3 or below -3.¶

In [36]:
floats = data_num.columns[data_num.dtypes == 'float64']
for col in floats:
    # Indexes of rows whose z-score falls outside [-3, 3]
    idx_high = data_num[data_num[col] > 3].index
    idx_low = data_num[data_num[col] < -3].index
    # Delete these rows from both the scaled and the original dataframe
    data_num.drop(idx_high, inplace=True)
    data_num.drop(idx_low, inplace=True)
    data_train.drop(idx_high, inplace=True)
    data_train.drop(idx_low, inplace=True)
In [37]:
data_num.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9507 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Safety_Score            9507 non-null   float64
 1   Control_Metric          9507 non-null   float64
 2   Turbulence_In_gforces   9507 non-null   float64
 3   Cabin_Temperature       9507 non-null   float64
 4   Max_Elevation           9507 non-null   float64
 5   Adverse_Weather_Metric  9507 non-null   float64
dtypes: float64(6)
memory usage: 519.9 KB

493 records were removed as they were considered outliers¶
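For reference, the same filtering can be written as a single boolean mask; a minimal equivalent sketch (the loop above has already done the work, so running this now would be a no-op):

# One-pass equivalent of the outlier loop: keep rows whose scaled
# features all lie within [-3, 3]
mask = data_num.abs().le(3).all(axis=1)
data_num = data_num[mask]
data_train = data_train[mask]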

Merging the scaled columns back into the original dataframe¶

In [38]:
data_train.drop(data_train.columns[data_train.dtypes == 'float64'],axis=1,inplace=True)
In [39]:
data_train.head()
Out[39]:
Severity Days_Since_Inspection Total_Safety_Complaints Accident_Type_Code Violations
0 Minor_Damage_And_Injuries 14 22 2 3
1 Minor_Damage_And_Injuries 10 27 2 2
2 Significant_Damage_And_Fatalities 13 16 7 3
3 Significant_Damage_And_Serious_Injuries 11 9 3 1
4 Significant_Damage_And_Fatalities 13 25 3 2
In [40]:
for column in data_num.columns:
    data_train[column]=data_num[column]
In [41]:
data_train.head()
Out[41]:
Severity Days_Since_Inspection Total_Safety_Complaints Accident_Type_Code Violations Safety_Score Control_Metric Turbulence_In_gforces Cabin_Temperature Max_Elevation Adverse_Weather_Metric
0 Minor_Damage_And_Injuries 14 22 2 3 0.455303 0.516733 -0.901749 -0.699134 -0.070649 0.442701
1 Minor_Damage_And_Injuries 10 27 2 2 1.275888 0.601122 0.349922 1.656279 -0.633736 0.253773
2 Significant_Damage_And_Fatalities 13 16 7 3 1.312673 0.102462 -0.485516 -0.401990 0.770528 -0.661939
3 Significant_Damage_And_Serious_Injuries 11 9 3 1 0.384562 0.804422 -0.366593 0.659758 1.141883 -0.115208
4 Significant_Damage_And_Fatalities 13 25 3 2 -0.953841 -1.447221 1.316177 -1.018021 0.371883 -0.206638

Label Encoding the Target Column¶

In [42]:
data_train['Severity'].unique()
Out[42]:
array(['Minor_Damage_And_Injuries', 'Significant_Damage_And_Fatalities',
       'Significant_Damage_And_Serious_Injuries',
       'Highly_Fatal_And_Damaging'], dtype=object)
In [43]:
encoder=LabelEncoder()
data_train['Severity']=encoder.fit_transform(data_train['Severity'])
In [44]:
data_train.head()
Out[44]:
Severity Days_Since_Inspection Total_Safety_Complaints Accident_Type_Code Violations Safety_Score Control_Metric Turbulence_In_gforces Cabin_Temperature Max_Elevation Adverse_Weather_Metric
0 1 14 22 2 3 0.455303 0.516733 -0.901749 -0.699134 -0.070649 0.442701
1 1 10 27 2 2 1.275888 0.601122 0.349922 1.656279 -0.633736 0.253773
2 2 13 16 7 3 1.312673 0.102462 -0.485516 -0.401990 0.770528 -0.661939
3 3 11 9 3 1 0.384562 0.804422 -0.366593 0.659758 1.141883 -0.115208
4 2 13 25 3 2 -0.953841 -1.447221 1.316177 -1.018021 0.371883 -0.206638
In [45]:
# Checking the unique values for dependent Variable (Severity)
data_train.Severity.unique()
Out[45]:
array([1, 2, 3, 0])
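LabelEncoder assigns codes in alphabetical order of the class names; a quick sketch to recover the mapping:

# Recover the label-to-code mapping (codes follow alphabetical order)
mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print(mapping)
# {'Highly_Fatal_And_Damaging': 0, 'Minor_Damage_And_Injuries': 1,
#  'Significant_Damage_And_Fatalities': 2, 'Significant_Damage_And_Serious_Injuries': 3}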
In [46]:
# Checking the Unique Values in Accident_Type_Code
data_train.Accident_Type_Code.unique()
Out[46]:
array([2, 7, 3, 4, 1, 6, 5])
In [47]:
# Checking the Unique Values in Violations
data_train.Violations.unique()
Out[47]:
array([3, 2, 1, 0, 4, 5])
In [48]:
# Checking the Unique Values in Days_Since_Inspection
data_train.Days_Since_Inspection.unique()
Out[48]:
array([14, 10, 13, 11, 15, 18,  5,  6, 12,  7,  8, 17,  9, 16, 20, 19, 21,
        3,  4,  1, 22,  2, 23])
In [49]:
data_train['Severity'].describe()
# A four-level categorical description of the severity of the crash (label-encoded above)
Out[49]:
count    9507.000000
mean        1.416430
std         1.183658
min         0.000000
25%         0.000000
50%         1.000000
75%         3.000000
max         3.000000
Name: Severity, dtype: float64
In [50]:
data_train['Safety_Score'].describe() 
# It gives a measure of how safe the plane was deemed to be.
Out[50]:
count    9507.000000
mean       -0.000848
std         0.983860
min        -2.595013
25%        -0.690688
50%        -0.034219
75%         0.659034
max         2.925549
Name: Safety_Score, dtype: float64
In [51]:
data_train['Days_Since_Inspection'].describe() 
# Measures how long the plane had gone without inspection before the incident
Out[51]:
count    9507.000000
mean       12.957926
std         3.514377
min         1.000000
25%        11.000000
50%        13.000000
75%        15.000000
max        23.000000
Name: Days_Since_Inspection, dtype: float64
In [52]:
data_train['Total_Safety_Complaints'].describe()
# Number of complaints from mechanics prior to the accident.
Out[52]:
count    9507.000000
mean        6.497633
std         6.886067
min         0.000000
25%         2.000000
50%         4.000000
75%         9.000000
max        54.000000
Name: Total_Safety_Complaints, dtype: float64
In [53]:
data_train['Control_Metric'].describe()
# An estimation of how much control the pilot had during the incident given the factors at play.
Out[53]:
count    9507.000000
mean        0.022516
std         0.972893
min        -2.966217
25%        -0.662790
50%         0.052596
75%         0.697018
max         2.933317
Name: Control_Metric, dtype: float64
In [54]:
data_train['Turbulence_In_gforces'].describe()
# Recorded turbulence experienced at the time of the accident.
Out[54]:
count    9507.000000
mean       -0.038054
std         0.942622
min        -2.040443
25%        -0.729310
50%        -0.149557
75%         0.547633
max         2.960296
Name: Turbulence_In_gforces, dtype: float64
In [55]:
data_train['Cabin_Temperature'].describe()
# Last recorded cabin temperature before the incident.
Out[55]:
count    9507.000000
mean       -0.030554
std         0.947306
min        -1.894959
25%        -0.731747
50%        -0.162825
75%         0.551047
max         2.975310
Name: Cabin_Temperature, dtype: float64
In [56]:
data_train['Max_Elevation'].describe()
# Maximum elevation above the ground, in meters.
Out[56]:
count    9507.000000
mean       -0.034620
std         0.973693
min        -2.984254
25%        -0.674868
50%        -0.032461
75%         0.620405
max         2.995312
Name: Max_Elevation, dtype: float64
In [57]:
data_train['Violations'].describe()
# Number of violations the aircraft received during inspection.
Out[57]:
count    9507.000000
mean        2.011255
std         1.037271
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max         5.000000
Name: Violations, dtype: float64
In [58]:
sns.barplot(x = "Days_Since_Inspection", y = "Severity", data = data_train)
#pt.savefig('Days since inspection vs Severity.png')
Out[58]:
<AxesSubplot:xlabel='Days_Since_Inspection', ylabel='Severity'>
In [59]:
#sns.barplot(x = "Safety_Score", y = "Severity", data = data_train)
In [60]:
sns.barplot(x = "Accident_Type_Code", y = "Severity", data = data_train)
#pt.savefig('Accident type code vs Severity.png')
Out[60]:
<AxesSubplot:xlabel='Accident_Type_Code', ylabel='Severity'>

Feature Engineering¶

In [61]:
data_train['Total_Safety_Complaints'] = np.power(2, data_train['Total_Safety_Complaints'])
data_train['Days_Since_Inspection'] = np.power(2, data_train['Days_Since_Inspection'])
data_train['Safety_Score'] = np.power(2, data_train['Safety_Score'])
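Since the same monotone 2**x transform must later be applied to the test set, one way to avoid drift is a small shared helper. A sketch (apply_power_features is a hypothetical name, not part of the original code):

# Hypothetical helper: apply the identical 2**x transform to any frame in place
def apply_power_features(df, cols=('Total_Safety_Complaints', 'Days_Since_Inspection', 'Safety_Score')):
    for col in cols:
        df[col] = np.power(2, df[col])
    return df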
In [63]:
X=data_train.drop(['Severity'],axis=1)
In [66]:
y=data_train['Severity']

Splitting Data into Train and Test¶

In [67]:
# Split dataset into training set and Validation set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1) # 90% training and 10% test
In [68]:
print(X_train.shape)
print(X_test.shape)
(8556, 10)
(951, 10)
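The class counts are uneven (see the countplot above), so a stratified split is a reasonable alternative; a sketch, assuming the same 90/10 ratio:

# Alternative (sketch): stratify on y so each class keeps its proportion in both splits
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, random_state=1, stratify=y)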

Decision Tree¶

In [69]:
#making the instance
model = DecisionTreeClassifier(random_state=1234)

#Hyper Parameters Set
param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_split': [5,10,15,20,25,50,100],
              'min_samples_leaf': [5,6,7,8,9,10,11],
              'max_depth': [5,10,15,25,100],
              'criterion': ['gini','entropy']}


# Create grid search object
clf1 = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5)

# Fit on data
best_clf_dt = clf1.fit(X_train, y_train)

#Predict
predictions = best_clf_dt.predict(X_test)

print("Accuracy", accuracy_score(y_test,predictions))
print("CLASSIFICATION - REPORT \n")
print("Confusion matrix \n",confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
Accuracy 0.7886435331230284
CLASSIFICATION - REPORT 

Confusion matrix 
 [[233  24   6  21]
 [ 17 189   9  21]
 [ 12  11 126  11]
 [ 24  32  13 202]]
              precision    recall  f1-score   support

           0       0.81      0.82      0.82       284
           1       0.74      0.80      0.77       236
           2       0.82      0.79      0.80       160
           3       0.79      0.75      0.77       271

    accuracy                           0.79       951
   macro avg       0.79      0.79      0.79       951
weighted avg       0.79      0.79      0.79       951

In [70]:
clf1.best_estimator_
Out[70]:
DecisionTreeClassifier(criterion='entropy', max_depth=15, max_features='auto',
                       min_samples_leaf=5, min_samples_split=20,
                       random_state=1234)
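cross_val_score was imported at the top but never used; a quick sketch that sanity-checks the tuned tree with 5-fold cross-validation on the training split:

# 5-fold CV accuracy of the tuned decision tree on the training data
scores = cross_val_score(clf1.best_estimator_, X_train, y_train, cv=5)
print("CV accuracy: {:.3f} +/- {:.3f}".format(scores.mean(), scores.std()))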
In [71]:
dot_data = StringIO()
# class_names must follow the LabelEncoder's alphabetical code order
export_graphviz(clf1.best_estimator_, out_file=dot_data, filled=True, rounded=True,
                feature_names=X.columns,
                class_names=list(encoder.classes_))

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.43098 to fit

Out[71]:
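The full tree is too large for the cairo renderer, as the warning above shows. One workaround is sklearn's own plot_tree with a depth cap; a minimal sketch:

# Render only the top two levels so the figure stays legible
from sklearn.tree import plot_tree
pt.figure(figsize=(20, 10))
plot_tree(clf1.best_estimator_, max_depth=2, feature_names=list(X.columns),
          class_names=list(encoder.classes_), filled=True, rounded=True)
pt.show()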

Random Forest¶

In [72]:
from sklearn.ensemble import RandomForestClassifier

#making the instance
model= RandomForestClassifier(random_state=1234)

#Hyper Parameters Set
param_grid = {'criterion': ['gini','entropy'],
              'n_estimators': [1,2,3,4,5],
              'min_samples_leaf': [1,2,3],
              'min_samples_split': [3,4,5,6,7]}

# Create grid search object
clf = GridSearchCV(model, param_grid=param_grid, n_jobs=-1, cv=5)

# Fit on data
best_clf_rf = clf.fit(X_train, y_train)

#Predict
predictions = best_clf_rf.predict(X_test)

#Check Prediction Score
print("Accuracy of Random Forest: ",accuracy_score(y_test, predictions))

#Print Classification Report
print("Confusion matrix \n",confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
Accuracy of Random Forest:  0.8958990536277602
Confusion matrix 
 [[260   9   7   8]
 [  5 222   3   6]
 [  5  14 135   6]
 [ 20  14   2 235]]
              precision    recall  f1-score   support

           0       0.90      0.92      0.91       284
           1       0.86      0.94      0.90       236
           2       0.92      0.84      0.88       160
           3       0.92      0.87      0.89       271

    accuracy                           0.90       951
   macro avg       0.90      0.89      0.89       951
weighted avg       0.90      0.90      0.90       951
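Before moving on, the tuned forest's feature importances are worth a look; a sketch mirroring the gradient-boosting inspection further below:

# Which features the tuned random forest relies on, sorted descending
rf_importances = pd.Series(best_clf_rf.best_estimator_.feature_importances_,
                           index=X_train.columns).sort_values(ascending=False)
print(rf_importances)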

Gradient Boosting¶

In [73]:
param_grid = {"n_estimators":[10,20,40,100],'max_depth':[3,4,5,6]}
In [74]:
gb_model = GradientBoostingClassifier()
In [75]:
grid = GridSearchCV(gb_model,param_grid)
In [76]:
grid.fit(X_train,y_train)
Out[76]:
GridSearchCV(estimator=GradientBoostingClassifier(),
             param_grid={'max_depth': [3, 4, 5, 6],
                         'n_estimators': [10, 20, 40, 100]})
In [77]:
grid.best_params_
Out[77]:
{'max_depth': 6, 'n_estimators': 100}
In [78]:
predictions = grid.predict(X_test)
In [79]:
predictions
Out[79]:
array([2, 0, 0, 2, 2, 3, 3, 0, 2, 0, 3, 3, 3, 0, 3, 3, 0, 3, 3, 2, 3, 0,
       2, 0, 1, 2, 0, 0, 1, 3, 1, 0, 0, 1, 1, 0, 2, 0, 3, 1, 2, 2, 3, 0,
       2, 3, 3, 0, 1, 2, 0, 1, 3, 0, 3, 1, 0, 2, 1, 1, 2, 2, 0, 1, 0, 3,
       0, 2, 3, 0, 2, 1, 1, 2, 2, 0, 0, 0, 1, 2, 3, 1, 1, 2, 2, 3, 3, 3,
       0, 1, 3, 0, 0, 3, 0, 0, 1, 0, 1, 3, 1, 1, 3, 0, 2, 1, 1, 2, 3, 1,
       1, 3, 3, 0, 0, 2, 1, 3, 1, 1, 2, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 3,
       0, 0, 3, 0, 0, 3, 1, 3, 3, 1, 3, 3, 3, 0, 3, 3, 0, 3, 1, 0, 0, 1,
       1, 1, 1, 3, 1, 1, 0, 1, 1, 3, 2, 0, 1, 2, 1, 0, 3, 0, 3, 1, 2, 0,
       3, 3, 0, 1, 0, 1, 0, 3, 0, 1, 1, 2, 3, 1, 0, 3, 0, 1, 0, 0, 1, 0,
       0, 2, 3, 2, 3, 3, 1, 1, 1, 3, 2, 2, 3, 0, 0, 1, 3, 0, 3, 2, 1, 3,
       2, 0, 3, 1, 1, 1, 0, 2, 0, 0, 1, 1, 1, 0, 2, 2, 3, 1, 1, 3, 3, 3,
       2, 1, 0, 1, 2, 3, 1, 3, 3, 1, 0, 3, 1, 0, 0, 0, 1, 3, 3, 1, 3, 1,
       2, 3, 1, 0, 1, 1, 0, 1, 2, 2, 2, 2, 2, 3, 3, 1, 2, 0, 1, 1, 1, 1,
       3, 3, 0, 0, 3, 0, 1, 3, 3, 1, 3, 0, 3, 3, 3, 0, 0, 2, 2, 1, 0, 1,
       1, 3, 2, 0, 0, 1, 3, 3, 3, 2, 0, 0, 0, 2, 0, 0, 3, 3, 3, 3, 2, 0,
       3, 0, 1, 0, 0, 3, 0, 3, 2, 3, 3, 1, 1, 3, 1, 3, 0, 1, 0, 3, 0, 0,
       1, 0, 3, 1, 1, 1, 3, 1, 2, 3, 0, 2, 1, 3, 1, 3, 1, 0, 0, 2, 0, 1,
       0, 0, 3, 1, 2, 0, 3, 3, 3, 3, 0, 2, 0, 0, 3, 3, 1, 2, 3, 0, 3, 1,
       1, 3, 1, 1, 3, 3, 1, 1, 0, 0, 0, 0, 1, 3, 0, 1, 3, 0, 0, 2, 2, 2,
       0, 1, 1, 1, 0, 0, 0, 3, 1, 1, 2, 3, 1, 0, 0, 1, 3, 3, 2, 2, 3, 0,
       1, 2, 0, 0, 1, 0, 3, 0, 1, 3, 2, 2, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
       0, 0, 3, 3, 0, 3, 0, 2, 1, 3, 2, 0, 0, 0, 2, 0, 2, 2, 1, 1, 3, 0,
       3, 3, 3, 1, 3, 0, 2, 1, 0, 3, 0, 2, 2, 0, 3, 3, 3, 1, 2, 3, 3, 2,
       2, 3, 2, 2, 0, 0, 1, 1, 2, 0, 3, 1, 1, 0, 2, 0, 3, 3, 3, 3, 3, 2,
       1, 3, 0, 1, 3, 1, 3, 3, 2, 1, 0, 3, 3, 3, 0, 1, 0, 3, 2, 0, 1, 2,
       0, 0, 0, 0, 3, 3, 3, 1, 1, 2, 3, 3, 1, 1, 0, 3, 2, 3, 2, 3, 1, 3,
       2, 3, 1, 0, 1, 0, 3, 2, 3, 3, 1, 1, 3, 3, 0, 0, 2, 1, 3, 3, 1, 1,
       0, 0, 2, 2, 3, 2, 0, 0, 1, 0, 1, 2, 0, 3, 1, 1, 0, 2, 0, 0, 0, 0,
       1, 0, 1, 0, 3, 1, 0, 2, 1, 1, 1, 1, 0, 0, 3, 3, 1, 1, 1, 2, 3, 0,
       1, 3, 0, 2, 1, 2, 0, 0, 1, 3, 2, 2, 1, 1, 2, 0, 0, 0, 3, 1, 3, 2,
       1, 0, 1, 3, 1, 3, 2, 3, 1, 1, 1, 3, 3, 2, 3, 2, 0, 1, 0, 3, 1, 3,
       0, 2, 0, 0, 3, 0, 2, 0, 3, 2, 1, 2, 0, 0, 2, 0, 1, 0, 1, 2, 3, 2,
       0, 3, 3, 1, 3, 3, 1, 0, 2, 0, 3, 1, 3, 3, 0, 2, 2, 2, 1, 1, 0, 0,
       0, 2, 3, 2, 1, 3, 2, 3, 3, 3, 0, 3, 0, 0, 1, 3, 2, 1, 3, 3, 2, 1,
       0, 0, 0, 3, 0, 3, 0, 0, 3, 2, 0, 1, 0, 1, 2, 1, 0, 3, 2, 2, 0, 0,
       3, 0, 0, 0, 0, 2, 3, 1, 3, 1, 0, 2, 1, 3, 3, 0, 1, 3, 0, 3, 3, 3,
       3, 1, 2, 3, 1, 0, 3, 1, 0, 1, 0, 1, 1, 2, 2, 3, 3, 3, 2, 1, 2, 0,
       0, 0, 0, 0, 2, 2, 0, 3, 0, 2, 0, 3, 0, 2, 1, 0, 2, 0, 0, 0, 3, 3,
       3, 1, 1, 0, 1, 3, 3, 1, 2, 3, 3, 2, 0, 3, 3, 2, 3, 2, 3, 0, 0, 1,
       2, 0, 3, 3, 1, 2, 3, 3, 3, 0, 0, 0, 3, 1, 3, 0, 3, 2, 3, 3, 3, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 3, 3, 0, 2, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 2, 0, 3, 0, 1, 2, 2, 3, 0, 0, 1, 3, 3, 1,
       3, 2, 1, 0, 3, 3, 3, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 3, 1, 1, 1, 0,
       1, 3, 3, 1, 0])
In [80]:
print(classification_report(y_test,predictions))
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       284
           1       0.95      0.96      0.95       236
           2       0.96      0.94      0.95       160
           3       0.96      0.95      0.96       271

    accuracy                           0.95       951
   macro avg       0.95      0.95      0.95       951
weighted avg       0.95      0.95      0.95       951

In [81]:
grid.best_estimator_.feature_importances_
Out[81]:
array([0.2324318 , 0.00451244, 0.11729531, 0.00133324, 0.39295686,
       0.20007292, 0.00886474, 0.0051295 , 0.00602159, 0.03138161])
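The raw array is hard to read on its own; a sketch pairing each importance with its column name:

# Pair importances with feature names for readability
print(pd.Series(grid.best_estimator_.feature_importances_,
                index=X_train.columns).sort_values(ascending=False))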
In [82]:
# Example of grid searching key hyperparameters for gradient boosting on a
# synthetic classification dataset.
# NOTE: this cell overwrites X and y with data from make_classification;
# it is a standalone demo and is not used by the cells that follow.
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
# define dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
# define the model with default hyperparameters
model = GradientBoostingClassifier()
# define the grid of values to search
grid = dict()
grid['n_estimators'] = [10, 50, 100, 500]
grid['learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 1.0]
grid['subsample'] = [0.5, 0.7, 1.0]
grid['max_depth'] = [3, 7, 9]
# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')
# execute the grid search
grid_result = grid_search.fit(X, y)
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
Best: 0.945333 using {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.7}
0.529333 (0.089403) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.5}
0.525000 (0.075840) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.7}
0.524000 (0.072874) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 1.0}
0.776333 (0.034687) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.5}
0.770667 (0.035957) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.7}
0.738667 (0.049982) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
0.831000 (0.032696) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.814000 (0.038349) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
0.761000 (0.043077) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
0.831333 (0.037659) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.5}
0.814667 (0.043261) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}
0.773667 (0.034975) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}
0.537333 (0.112159) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.5}
0.535000 (0.105095) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.7}
0.531667 (0.095222) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 10, 'subsample': 1.0}
0.841000 (0.030260) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.5}
0.841333 (0.033539) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.7}
0.804000 (0.033625) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 50, 'subsample': 1.0}
0.870667 (0.031721) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.5}
0.868333 (0.031526) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}
0.809000 (0.029366) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 100, 'subsample': 1.0}
0.887000 (0.031427) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.5}
0.881333 (0.029181) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.7}
0.810667 (0.032857) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 500, 'subsample': 1.0}
0.534667 (0.104267) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.5}
0.535333 (0.106293) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.7}
0.531333 (0.094012) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 10, 'subsample': 1.0}
0.833000 (0.033877) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.5}
0.841667 (0.026967) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.7}
0.808000 (0.029710) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 50, 'subsample': 1.0}
0.874667 (0.029970) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.5}
0.877667 (0.029853) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.7}
0.812333 (0.026543) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 100, 'subsample': 1.0}
0.892333 (0.027891) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.5}
0.885000 (0.029972) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.7}
0.812000 (0.027976) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 500, 'subsample': 1.0}
0.813000 (0.033779) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.5}
0.802000 (0.038070) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.7}
0.761333 (0.043107) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 1.0}
0.825333 (0.039474) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.5}
0.814667 (0.038534) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.7}
0.773667 (0.034975) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
0.834000 (0.036111) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.813000 (0.038914) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
0.781333 (0.034325) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
0.846667 (0.033797) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.5}
0.833333 (0.033300) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}
0.814333 (0.032730) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}
0.845000 (0.034132) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.5}
0.852333 (0.029403) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.7}
0.806667 (0.030912) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 10, 'subsample': 1.0}
0.881667 (0.031632) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.5}
0.875000 (0.030957) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.7}
0.810667 (0.033059) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 50, 'subsample': 1.0}
0.886333 (0.033315) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.5}
0.877333 (0.030214) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}
0.808333 (0.033475) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 100, 'subsample': 1.0}
0.895333 (0.028371) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.5}
0.884333 (0.030186) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.7}
0.822000 (0.024276) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 500, 'subsample': 1.0}
0.851333 (0.025263) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.5}
0.862000 (0.032802) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.7}
0.812333 (0.028365) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 10, 'subsample': 1.0}
0.889667 (0.031568) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.5}
0.881000 (0.029704) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.7}
0.812000 (0.027857) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 50, 'subsample': 1.0}
0.892667 (0.031826) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.5}
0.884000 (0.029620) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.7}
0.812000 (0.026128) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 100, 'subsample': 1.0}
0.902000 (0.027129) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.5}
0.889000 (0.027123) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.7}
0.818667 (0.028952) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 500, 'subsample': 1.0}
0.823000 (0.028885) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.5}
0.808000 (0.039107) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.7}
0.780000 (0.035214) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 10, 'subsample': 1.0}
0.844333 (0.035934) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.5}
0.836333 (0.036008) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.7}
0.813333 (0.032283) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
0.857000 (0.030348) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.847000 (0.031849) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
0.836000 (0.034020) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
0.899000 (0.030039) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.5}
0.890000 (0.030000) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}
0.878333 (0.029107) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}
0.870333 (0.024964) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.5}
0.865667 (0.030734) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.7}
0.808667 (0.029970) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 10, 'subsample': 1.0}
0.895667 (0.026164) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.5}
0.880333 (0.031250) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.7}
0.822667 (0.023795) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 50, 'subsample': 1.0}
0.898000 (0.031979) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.5}
0.891000 (0.032797) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}
0.837000 (0.025968) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100, 'subsample': 1.0}
0.925000 (0.026045) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.5}
0.920667 (0.025940) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.7}
0.887000 (0.031953) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500, 'subsample': 1.0}
0.865667 (0.025519) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.5}
0.865333 (0.030192) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.7}
0.812000 (0.027857) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 10, 'subsample': 1.0}
0.895667 (0.032831) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.5}
0.886333 (0.031462) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.7}
0.816667 (0.030037) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 50, 'subsample': 1.0}
0.901000 (0.030039) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.5}
0.894667 (0.024322) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.7}
0.819333 (0.028394) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 100, 'subsample': 1.0}
0.921667 (0.024642) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.5}
0.919000 (0.027851) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.7}
0.830667 (0.027681) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 500, 'subsample': 1.0}
0.840333 (0.034106) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.5}
0.840667 (0.025682) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.7}
0.830000 (0.036515) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 10, 'subsample': 1.0}
0.887000 (0.029343) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.5}
0.886667 (0.031658) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.7}
0.880333 (0.033713) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
0.907000 (0.031744) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.900667 (0.030104) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
0.899000 (0.031236) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
0.926000 (0.026907) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.5}
0.927000 (0.026975) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}
0.919000 (0.025475) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}
0.870667 (0.030869) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.5}
0.869667 (0.028575) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.7}
0.835333 (0.023055) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 10, 'subsample': 1.0}
0.916000 (0.026533) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.5}
0.912000 (0.030485) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.7}
0.879000 (0.033501) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50, 'subsample': 1.0}
0.926333 (0.026011) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.5}
0.927667 (0.027164) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}
0.905667 (0.032113) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 1.0}
0.944333 (0.022462) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.5}
0.944333 (0.024857) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.7}
0.926000 (0.025768) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500, 'subsample': 1.0}
0.869333 (0.030761) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.5}
0.878000 (0.024685) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.7}
0.823333 (0.027968) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 10, 'subsample': 1.0}
0.917000 (0.031107) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.5}
0.904000 (0.030725) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.7}
0.845667 (0.028482) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 50, 'subsample': 1.0}
0.928667 (0.027415) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.5}
0.932667 (0.031298) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.7}
0.847667 (0.039806) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 100, 'subsample': 1.0}
0.939667 (0.026392) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.5}
0.945333 (0.021715) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.7}
0.858667 (0.047027) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 500, 'subsample': 1.0}
0.824667 (0.030739) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.5}
0.842000 (0.039446) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.7}
0.844667 (0.032014) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 10, 'subsample': 1.0}
0.823333 (0.036086) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.5}
0.869000 (0.036729) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.7}
0.896333 (0.033812) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
0.832667 (0.042734) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.876000 (0.040464) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
0.905000 (0.032326) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
0.807667 (0.064945) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.5}
0.903000 (0.034269) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}
0.922667 (0.027195) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}
0.805667 (0.041608) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.5}
0.847333 (0.036600) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.7}
0.875667 (0.026418) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 10, 'subsample': 1.0}
0.832000 (0.045417) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.5}
0.901667 (0.029107) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.7}
0.918000 (0.029710) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 50, 'subsample': 1.0}
0.825333 (0.056729) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.5}
0.915667 (0.027771) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}
0.916333 (0.034301) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 100, 'subsample': 1.0}
0.779333 (0.144751) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.5}
0.910667 (0.034731) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.7}
0.923000 (0.032265) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 500, 'subsample': 1.0}
0.805333 (0.046385) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.5}
0.854667 (0.026297) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.7}
0.881667 (0.028412) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 10, 'subsample': 1.0}
0.840333 (0.048955) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.5}
0.912000 (0.033106) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.7}
0.917000 (0.026096) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 50, 'subsample': 1.0}
0.804333 (0.120324) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.5}
0.909000 (0.028208) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.7}
0.918333 (0.024506) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 100, 'subsample': 1.0}
0.814667 (0.144770) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.5}
0.910667 (0.031826) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.7}
0.918333 (0.025309) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 500, 'subsample': 1.0}

Extreme Gradient Boosting¶

In [83]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

#Pipeline
pipe_XGB = Pipeline([('XGB', XGBClassifier())]) 

#Parameter-grid
param_grid = {'XGB__learning_rate':[0.1,0.2],'XGB__max_depth' :[5,10], 'XGB__gamma':[0.1,0.3]} 
 
#Using RandomizedSearchCV
Random_XGB = RandomizedSearchCV(pipe_XGB, param_distributions=param_grid, cv=10, n_iter=3)
#Fitting the data in the model
Random_XGB.fit(X_train, y_train)

print("Best cross-validation score obtained is: {:.2f}".format(Random_XGB.best_score_))
print("Best parameters from the randomized search: ", Random_XGB.best_params_)
print("Train set score obtained is: {:.2f}".format(Random_XGB.score(X_train, y_train)))
print("Test set score obtained is: {:.2f}".format(Random_XGB.score(X_test, y_test)))
Best cross-validation score obtained is: 0.96
Best parameters from the randomized search:  {'XGB__max_depth': 10, 'XGB__learning_rate': 0.2, 'XGB__gamma': 0.1}
Train set score obtained is: 1.00
Test set score obtained is: 0.95
In [84]:
y_pred=Random_XGB.predict(X_test)
In [85]:
# Use distinct variable names so the imported metric functions are not shadowed
acc = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average='macro')
recall = metrics.recall_score(y_test, y_pred, average='macro')
f1 = metrics.f1_score(y_test, y_pred, average='macro')
print("The Accuracy of this model is {0:.2f}%".format(acc*100))
print("The Precision of this model is {0:.2f}%".format(precision*100))
print("The Recall score of this model is {0:.2f}%".format(recall*100))
print("The f1 score of this model is {0:.2f}%".format(f1*100))
The Accuracy of this model is 95.48%
The Precision of this model is 95.39%
The Recall score of this model is 95.45%
The f1 score of this model is 95.42%
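A confusion-matrix heatmap makes the remaining class confusions easier to see; a sketch using the already-imported seaborn and confusion_matrix:

# Visualize per-class errors of the tuned XGBoost model
cm = confusion_matrix(y_test, y_pred)
pt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=encoder.classes_, yticklabels=encoder.classes_)
pt.xlabel('Predicted')
pt.ylabel('Actual')
pt.show()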
In [86]:
Random_XGB.cv_results_
Out[86]:
{'mean_fit_time': array([0.85219514, 1.55653679, 0.94187734]),
 'std_fit_time': array([0.06057696, 0.08860524, 0.14080043]),
 'mean_score_time': array([0.0035188 , 0.00447943, 0.00354114]),
 'std_score_time': array([0.00052528, 0.00045283, 0.00053228]),
 'param_XGB__max_depth': masked_array(data=[5, 10, 5],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'param_XGB__learning_rate': masked_array(data=[0.1, 0.2, 0.2],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'param_XGB__gamma': masked_array(data=[0.3, 0.1, 0.1],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'XGB__max_depth': 5,
   'XGB__learning_rate': 0.1,
   'XGB__gamma': 0.3},
  {'XGB__max_depth': 10, 'XGB__learning_rate': 0.2, 'XGB__gamma': 0.1},
  {'XGB__max_depth': 5, 'XGB__learning_rate': 0.2, 'XGB__gamma': 0.1}],
 'split0_test_score': array([0.94392523, 0.96028037, 0.95093458]),
 'split1_test_score': array([0.94158879, 0.9521028 , 0.94859813]),
 'split2_test_score': array([0.95443925, 0.9614486 , 0.95911215]),
 'split3_test_score': array([0.9567757 , 0.97196262, 0.9567757 ]),
 'split4_test_score': array([0.95560748, 0.95794393, 0.95794393]),
 'split5_test_score': array([0.94976636, 0.96378505, 0.95327103]),
 'split6_test_score': array([0.94853801, 0.95789474, 0.95906433]),
 'split7_test_score': array([0.95555556, 0.9754386 , 0.96608187]),
 'split8_test_score': array([0.94269006, 0.96023392, 0.94736842]),
 'split9_test_score': array([0.94736842, 0.95204678, 0.94853801]),
 'mean_test_score': array([0.94962549, 0.96131374, 0.95476881]),
 'std_test_score': array([0.00544726, 0.00717658, 0.00572731]),
 'rank_test_score': array([3, 1, 2], dtype=int32)}
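The raw cv_results_ dict is easier to scan as a DataFrame; a short sketch:

# Tabulate the randomized-search results
results = pd.DataFrame(Random_XGB.cv_results_)
print(results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']])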

Predicting the test data¶

In [87]:
data_test.drop(['Accident_ID'],axis=1,inplace=True)
data_test.head()
Out[87]:
Safety_Score Days_Since_Inspection Total_Safety_Complaints Control_Metric Turbulence_In_gforces Cabin_Temperature Accident_Type_Code Max_Elevation Violations Adverse_Weather_Metric
0 19.497717 16 6 72.151322 0.388959 78.32 4 37949.724386 2 0.069692
1 58.173516 15 3 64.585232 0.250841 78.60 7 30194.805567 2 0.002777
2 33.287671 15 3 64.721969 0.336669 86.96 6 17572.925484 1 0.004316
3 3.287671 21 5 66.362808 0.421775 80.86 3 40209.186341 2 0.199990
4 10.867580 18 2 56.107566 0.313228 79.22 2 35495.525408 2 0.483696
In [88]:
data_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Safety_Score             2500 non-null   float64
 1   Days_Since_Inspection    2500 non-null   int64  
 2   Total_Safety_Complaints  2500 non-null   int64  
 3   Control_Metric           2500 non-null   float64
 4   Turbulence_In_gforces    2500 non-null   float64
 5   Cabin_Temperature        2500 non-null   float64
 6   Accident_Type_Code       2500 non-null   int64  
 7   Max_Elevation            2500 non-null   float64
 8   Violations               2500 non-null   int64  
 9   Adverse_Weather_Metric   2500 non-null   float64
dtypes: float64(6), int64(4)
memory usage: 195.4 KB
In [89]:
num = pd.DataFrame(data_test, columns =data_test.columns[data_test.dtypes == 'float64']) 
num.head()
Out[89]:
Safety_Score Control_Metric Turbulence_In_gforces Cabin_Temperature Max_Elevation Adverse_Weather_Metric
0 19.497717 72.151322 0.388959 78.32 37949.724386 0.069692
1 58.173516 64.585232 0.250841 78.60 30194.805567 0.002777
2 33.287671 64.721969 0.336669 86.96 17572.925484 0.004316
3 3.287671 66.362808 0.421775 80.86 40209.186341 0.199990
4 10.867580 56.107566 0.313228 79.22 35495.525408 0.483696
In [90]:
num=num.apply(zscore)
In [91]:
data_test.drop(data_test.columns[data_test.dtypes == 'float64'],axis=1,inplace=True)
data_test.head()
Out[91]:
Days_Since_Inspection Total_Safety_Complaints Accident_Type_Code Violations
0 16 6 4 2
1 15 3 7 2
2 15 3 6 1
3 21 5 3 2
4 18 2 2 2
In [92]:
for column in num.columns:
    data_test[column]=num[column]
In [93]:
data_test.head()
Out[93]:
Days_Since_Inspection Total_Safety_Complaints Accident_Type_Code Violations Safety_Score Control_Metric Turbulence_In_gforces Cabin_Temperature Max_Elevation Adverse_Weather_Metric
0 16 6 4 2 -1.371727 0.592957 0.109134 -0.616620 0.586995 -0.467493
1 15 3 7 2 1.004384 -0.068431 -1.071998 -0.513424 -0.230758 -0.640138
2 15 3 6 1 -0.524519 -0.056478 -0.338031 2.567706 -1.561731 -0.636168
3 21 5 3 2 -2.367618 0.086956 0.389769 0.319513 0.825254 -0.131314
4 18 2 2 2 -1.901934 -0.809504 -0.538484 -0.284919 0.328201 0.600666
In [94]:
data_test['Total_Safety_Complaints'] = np.power(2, data_test['Total_Safety_Complaints'])
data_test['Days_Since_Inspection'] = np.power(2, data_test['Days_Since_Inspection'])
data_test['Safety_Score'] = np.power(2, data_test['Safety_Score'])
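Note that the test frame above was z-scored with its own statistics rather than the training set's. A hypothetical refactor that would keep the two consistent (StandardScaler was imported at the top but never used; train_raw and test_raw stand for the unscaled frames and are assumed names):

# Sketch: fit the scaler on training data only, then reuse it on the test data
float_cols = ['Safety_Score', 'Control_Metric', 'Turbulence_In_gforces',
              'Cabin_Temperature', 'Max_Elevation', 'Adverse_Weather_Metric']
scaler = StandardScaler().fit(train_raw[float_cols])   # train_raw: unscaled training frame (hypothetical)
test_scaled = scaler.transform(test_raw[float_cols])   # test_raw: unscaled test frame (hypothetical)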

Predictions using Extreme Gradient Boosting¶

In [95]:
testPredictions=Random_XGB.predict(data_test)
In [96]:
data_test['Severity']=encoder.inverse_transform(testPredictions)
In [98]:
data_test.head()
Out[98]:
Days_Since_Inspection Total_Safety_Complaints Accident_Type_Code Violations Safety_Score Control_Metric Turbulence_In_gforces Cabin_Temperature Max_Elevation Adverse_Weather_Metric Severity
0 65536 64 4 2 0.386428 0.592957 0.109134 -0.616620 0.586995 -0.467493 Highly_Fatal_And_Damaging
1 32768 8 7 2 2.006087 -0.068431 -1.071998 -0.513424 -0.230758 -0.640138 Significant_Damage_And_Fatalities
2 32768 8 6 1 0.695191 -0.056478 -0.338031 2.567706 -1.561731 -0.636168 Significant_Damage_And_Serious_Injuries
3 2097152 32 3 2 0.193765 0.086956 0.389769 0.319513 0.825254 -0.131314 Highly_Fatal_And_Damaging
4 262144 4 2 2 0.267584 -0.809504 -0.538484 -0.284919 0.328201 0.600666 Significant_Damage_And_Fatalities
In [99]:
final_test = pd.read_csv('/Users/huzaifkherani/Desktop/AML/Project/DATA/test.csv')
In [100]:
final_test['Severity']=data_test['Severity']
In [101]:
final_test.head()
Out[101]:
Safety_Score Days_Since_Inspection Total_Safety_Complaints Control_Metric Turbulence_In_gforces Cabin_Temperature Accident_Type_Code Max_Elevation Violations Adverse_Weather_Metric Accident_ID Severity
0 19.497717 16 6 72.151322 0.388959 78.32 4 37949.724386 2 0.069692 1 Highly_Fatal_And_Damaging
1 58.173516 15 3 64.585232 0.250841 78.60 7 30194.805567 2 0.002777 10 Significant_Damage_And_Fatalities
2 33.287671 15 3 64.721969 0.336669 86.96 6 17572.925484 1 0.004316 14 Significant_Damage_And_Serious_Injuries
3 3.287671 21 5 66.362808 0.421775 80.86 3 40209.186341 2 0.199990 17 Highly_Fatal_And_Damaging
4 10.867580 18 2 56.107566 0.313228 79.22 2 35495.525408 2 0.483696 21 Significant_Damage_And_Fatalities
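To persist the result, the two relevant columns can be written out; a sketch ('submission.csv' is an assumed file name):

# Write Accident_ID plus predicted Severity to disk
final_test[['Accident_ID', 'Severity']].to_csv('submission.csv', index=False)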
In [ ]: